{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import f1_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "trn_path = 'hy_round1_train_20200102'\n",
    "test_path = 'hy_round1_testA_20200102'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 7000/7000 [00:41<00:00, 166.70it/s]\n",
      "100%|██████████| 2000/2000 [00:12<00:00, 163.77it/s]\n"
     ]
    }
   ],
   "source": [
    "def get_data(path, get_type=True):\n",
    "    features = []\n",
    "    for file in tqdm(os.listdir(path)):\n",
    "        file_path = os.path.join(path, file)\n",
    "        df = pd.read_csv(file_path)\n",
    "        if get_type:\n",
    "            features.append([df['x'].std(), df['x'].mean(),\n",
    "                             df['y'].std(), df['y'].mean(),\n",
    "                             df['速度'].mean(), df['速度'].std(), \n",
    "                             df['方向'].mean(), df['方向'].std(),\n",
    "                             file,\n",
    "                             df['type'][0]])\n",
    "        else:\n",
    "            features.append([df['x'].std(), df['x'].mean(),\n",
    "                             df['y'].std(), df['y'].mean(),\n",
    "                             df['速度'].mean(), df['速度'].std(), \n",
    "                             df['方向'].mean(), df['方向'].std(),\n",
    "                             file])\n",
    "    df = pd.DataFrame(features)\n",
    "    if get_type:\n",
    "        df = df.rename(columns={len(features[0])-1:'label'})\n",
    "        df = df.rename(columns={len(features[0])-2:'filename'})\n",
    "        label_dict = {'拖网':0, '刺网':1, '围网':2}\n",
    "        df['label'] = df['label'].map(label_dict)\n",
    "    else:\n",
    "        df = df.rename(columns={len(features[0])-1:'filename'})\n",
    "    \n",
    "\n",
    "    return df\n",
    "df_train = get_data(trn_path)\n",
    "df_test = get_data(test_path, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "folds = KFold(n_splits=5, shuffle=True, random_state=2019)\n",
    "oof_lgb = np.zeros((len(df_train),3))\n",
    "col = [tmp_col for tmp_col in df_train.columns if tmp_col not in ['label', 'filename']]\n",
    "X_train = df_train[col].values\n",
    "y_train = df_train['label'].values\n",
    "prediction = np.zeros((len(df_test),3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°1\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[500]\ttraining's multi_logloss: 0.226924\tvalid_1's multi_logloss: 0.326774\n",
      "[1000]\ttraining's multi_logloss: 0.145244\tvalid_1's multi_logloss: 0.313612\n",
      "Early stopping, best iteration is:\n",
      "[1324]\ttraining's multi_logloss: 0.111339\tvalid_1's multi_logloss: 0.311507\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[500]\ttraining's multi_logloss: 0.225747\tvalid_1's multi_logloss: 0.352029\n",
      "[1000]\ttraining's multi_logloss: 0.142032\tvalid_1's multi_logloss: 0.336953\n",
      "Early stopping, best iteration is:\n",
      "[1221]\ttraining's multi_logloss: 0.117995\tvalid_1's multi_logloss: 0.333692\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[500]\ttraining's multi_logloss: 0.228576\tvalid_1's multi_logloss: 0.339163\n",
      "[1000]\ttraining's multi_logloss: 0.146269\tvalid_1's multi_logloss: 0.322189\n",
      "Early stopping, best iteration is:\n",
      "[1149]\ttraining's multi_logloss: 0.129774\tvalid_1's multi_logloss: 0.320121\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[500]\ttraining's multi_logloss: 0.224412\tvalid_1's multi_logloss: 0.344711\n",
      "[1000]\ttraining's multi_logloss: 0.140956\tvalid_1's multi_logloss: 0.332567\n",
      "Early stopping, best iteration is:\n",
      "[1127]\ttraining's multi_logloss: 0.127273\tvalid_1's multi_logloss: 0.332026\n",
      "fold n°5\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[500]\ttraining's multi_logloss: 0.222461\tvalid_1's multi_logloss: 0.362564\n",
      "[1000]\ttraining's multi_logloss: 0.13913\tvalid_1's multi_logloss: 0.341437\n",
      "Early stopping, best iteration is:\n",
      "[1163]\ttraining's multi_logloss: 0.121438\tvalid_1's multi_logloss: 0.339885\n"
     ]
    }
   ],
   "source": [
    "param = {'num_leaves': 31,\n",
    "#          'min_data_in_leaf': 30, \n",
    "         'objective':'multiclassova',\n",
    "         'num_class':3,\n",
    "         'learning_rate': 0.01,\n",
    "#          \"min_child_samples\": 20,\n",
    "         \"boosting\": \"gbdt\",\n",
    "         \"feature_fraction\": 0.9,\n",
    "#          \"bagging_freq\": 1,\n",
    "         \"bagging_fraction\": 0.9 ,\n",
    "#          \"bagging_seed\": 11,\n",
    "#          \"metric\": 'mse',\n",
    "         \"lambda_l1\": 0.1,\n",
    "         \"verbosity\": -1}\n",
    "\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):\n",
    "    print(\"fold n°{}\".format(fold_+1))\n",
    "    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])\n",
    "    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])\n",
    "\n",
    "    num_round = 2000\n",
    "    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 100)\n",
    "    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)\n",
    "    prediction += clf.predict(df_test[col].values, num_iteration=clf.best_iteration)\n",
    "#     oof_lgb2 += clf.predict(df_remove[col].values, num_iteration=clf.best_iteration)\n",
    "oof_lgb_final = np.argmax(oof_lgb, axis=1)  \n",
    "#     predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8356443698590562"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f1_score(y_train, oof_lgb_final, average='macro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_label = np.argmax(prediction, axis=1)\n",
    "label_dict = {0:'拖网', 1:'刺网', 2:'围网'}\n",
    "df_test['filename'] = df_test['filename'].apply(lambda x:x[0:4])\n",
    "df_pred = pd.DataFrame()\n",
    "df_pred['filename'] = df_test['filename']\n",
    "df_pred['label'] = pred_label\n",
    "df_pred['label'] = df_pred['label'].map(label_dict)\n",
    "df_pred.to_csv('sub2.csv', index=None, header=False)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
